{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 【Task 6】 模型融合（2天）  \n",
    "模型融合方式任意，并结合Task5给出你的最优结果。  \n",
    "时间：2天  \n",
    "例如Stacking融合，用你目前评分最高的模型作为基准模型，和其他模型进行stacking融合，得到最终模型及评分结果。  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd \n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.svm import SVC\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier\n",
    "from sklearn.metrics import classification_report\n",
    "import xgboost as xgb\n",
    "import lightgbm as lgb\n",
    "from pandas import DataFrame,Series\n",
    "\n",
    "# Load the raw data set; the CSV file is read with GBK encoding.\n",
    "df = pd.read_csv('data.csv', encoding='gbk')\n",
    "\n",
    "# ---------------- Data preparation ----------------\n",
    "\n",
    "# Drop the columns the original author treats as irrelevant features.\n",
    "df = df.drop(columns=['Unnamed: 0', 'custid', 'trade_no', 'bank_card_no', 'source', 'id_name'])\n",
    "\n",
    "# Type conversion (aimed at object / text columns): missing values of\n",
    "# `reg_preference_for_trad` become '其他城市', then each category is replaced by an integer code.\n",
    "# The result is assigned back instead of using inplace=True on a column selection,\n",
    "# because the chained in-place form does not update `df` under pandas Copy-on-Write.\n",
    "city_codes = {'一线城市': 1, '二线城市': 2, '三线城市': 3, '境外': 4, '其他城市': 5}\n",
    "df['reg_preference_for_trad'] = df['reg_preference_for_trad'].fillna('其他城市').replace(city_codes)\n",
    "\n",
    "# The date columns 'latest_query_time' and 'loans_latest_time' are dropped for now\n",
    "# instead of being parsed (left as an open question by the original author).\n",
    "df = df.drop(columns=['latest_query_time', 'loans_latest_time'])\n",
    "\n",
    "# ---------------- Missing values ----------------\n",
    "df['student_feature'] = df['student_feature'].fillna(0)\n",
    "for col in df.columns:\n",
    "    # mode() may return several values, so [0] picks the first one as the fill value.\n",
    "    df[col] = df[col].fillna(df[col].mode()[0])\n",
    "\n",
    "# ---------------- Train / test split ----------------\n",
    "y = df['status']\n",
    "x = df.drop('status', axis=1)\n",
    "x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=2018)\n",
    "\n",
    "# ---------------- Standardization ----------------\n",
    "# Keep the column names: the scaler returns plain arrays.\n",
    "# (The original author noted 83 features at the time of writing.)\n",
    "features = x_train.columns\n",
    "scaler = StandardScaler()\n",
    "x_train = scaler.fit_transform(x_train)\n",
    "# Only transform the test split: it must be scaled with the mean / variance learned on the\n",
    "# training split. Re-fitting here would scale the two splits differently and leak test statistics.\n",
    "x_test = scaler.transform(x_test)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 训练候选模型（LR、SVM、LightGBM、XGBoost、决策树），并从中选三个效果好的模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0    0.81006   0.93446   0.86783      1068\n",
      "           1    0.64103   0.34819   0.45126       359\n",
      "\n",
      "    accuracy                        0.78697      1427\n",
      "   macro avg    0.72555   0.64132   0.65954      1427\n",
      "weighted avg    0.76754   0.78697   0.76303      1427\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\MyCodeEnvironment\\Anaconda3\\envs\\p3\\lib\\site-packages\\sklearn\\linear_model\\logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
      "  FutureWarning)\n"
     ]
    }
   ],
   "source": [
    "# Logistic regression with default hyper-parameters; fit() returns the fitted estimator.\n",
    "LR = LogisticRegression().fit(x_train, y_train)\n",
    "lr_y_pred = LR.predict(x_test)\n",
    "# Per-class precision / recall / F1 on the held-out split, printed with 5 decimals.\n",
    "ans = classification_report(y_true=y_test, y_pred=lr_y_pred, digits=5)\n",
    "print(ans)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0    0.78895   0.96255   0.86714      1068\n",
      "           1    0.67742   0.23398   0.34783       359\n",
      "\n",
      "    accuracy                        0.77926      1427\n",
      "   macro avg    0.73318   0.59827   0.60749      1427\n",
      "weighted avg    0.76089   0.77926   0.73650      1427\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Support vector classifier; probability=True makes predict_proba available.\n",
    "SVM = SVC(probability=True, gamma='auto').fit(x_train, y_train)\n",
    "# Class probabilities and hard labels for the held-out split.\n",
    "svm_y_pro = SVM.predict_proba(x_test)\n",
    "svm_y_pred = SVM.predict(x_test)\n",
    "# Per-class precision / recall / F1, printed with 5 decimals.\n",
    "ans = classification_report(y_true=y_test, y_pred=svm_y_pred, digits=5)\n",
    "print(ans)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0    0.79694   0.92603   0.85665      1068\n",
      "           1    0.57527   0.29805   0.39266       359\n",
      "\n",
      "    accuracy                        0.76804      1427\n",
      "   macro avg    0.68610   0.61204   0.62465      1427\n",
      "weighted avg    0.74117   0.76804   0.73992      1427\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# LightGBM classifier; the hyper-parameters are collected in one dict so they are easy to read and tune.\n",
    "lgbm_params = dict(\n",
    "    boosting_type='gbdt', num_leaves=31, max_depth=-1, learning_rate=0.1, n_estimators=250,\n",
    "    max_bin=255, subsample_for_bin=200000, objective=None, min_split_gain=0.0, min_child_weight=0.001,\n",
    "    min_child_samples=20, subsample=1.0, subsample_freq=1, colsample_bytree=1.0, reg_alpha=0.0,\n",
    "    reg_lambda=0.5, random_state=None, n_jobs=-1, silent=True,\n",
    ")\n",
    "LGBM = lgb.LGBMClassifier(**lgbm_params)\n",
    "LGBM.fit(x_train, y_train)\n",
    "lgbm_y_pred = LGBM.predict(x_test)\n",
    "\n",
    "# Per-class precision / recall / F1 on the held-out split, printed with 5 decimals.\n",
    "ans = classification_report(y_true=y_test, y_pred=lgbm_y_pred, digits=5)\n",
    "print(ans)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0    0.79433   0.94382   0.86264      1068\n",
      "           1    0.62025   0.27298   0.37911       359\n",
      "\n",
      "    accuracy                        0.77505      1427\n",
      "   macro avg    0.70729   0.60840   0.62088      1427\n",
      "weighted avg    0.75053   0.77505   0.74100      1427\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# XGBoost classifier.\n",
    "# `status` has two classes (0 / 1), so the model is trained with the binary logistic objective\n",
    "# and without num_class. The multi-class 'multi:softmax' objective outputs class labels rather\n",
    "# than probabilities (per the XGBoost parameter docs), which is a poor fit for the stacking\n",
    "# step that requests probabilities from its base learners via use_probas=True.\n",
    "XGB = xgb.XGBClassifier(max_depth=6, learning_rate=0.1, n_estimators=100, silent=True, objective='binary:logistic',\n",
    "                        nthread=32, gamma=0.1, min_child_weight=3, max_delta_step=0, subsample=1, colsample_bytree=1,\n",
    "                        colsample_bylevel=1, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=0.5, seed=2018,\n",
    "                        missing=None)\n",
    "XGB.fit(x_train, y_train)\n",
    "xgb2_y_pred = XGB.predict(x_test)\n",
    "\n",
    "# Per-class precision / recall / F1 on the held-out split, printed with 5 decimals.\n",
    "ans = classification_report(y_test, xgb2_y_pred, digits=5)\n",
    "print(ans)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0    0.79709   0.76873   0.78265      1068\n",
      "           1    0.37783   0.41783   0.39683       359\n",
      "\n",
      "    accuracy                        0.68045      1427\n",
      "   macro avg    0.58746   0.59328   0.58974      1427\n",
      "weighted avg    0.69161   0.68045   0.68559      1427\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Decision tree baseline. random_state is fixed (same seed as the train/test split) so the\n",
    "# fitted tree, and therefore the report below, is reproducible across runs.\n",
    "DT = DecisionTreeClassifier(random_state=2018)\n",
    "DT.fit(x_train, y_train)\n",
    "# Hard labels and class probabilities for the held-out split.\n",
    "dt_y_pred = DT.predict(x_test)\n",
    "dt_y_pro = DT.predict_proba(x_test)\n",
    "# Per-class precision / recall / F1, printed with 5 decimals.\n",
    "ans = classification_report(y_test, dt_y_pred, digits=5)\n",
    "print(ans)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 模型融合"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\MyCodeEnvironment\\Anaconda3\\envs\\p3\\lib\\site-packages\\sklearn\\linear_model\\logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
      "  FutureWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0    0.79652   0.94195   0.86315      1068\n",
      "           1    0.62195   0.28412   0.39006       359\n",
      "\n",
      "    accuracy                        0.77645      1427\n",
      "   macro avg    0.70923   0.61304   0.62660      1427\n",
      "weighted avg    0.75260   0.77645   0.74413      1427\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from mlxtend.classifier import StackingCVClassifier\n",
    "# Stacked ensemble: XGB, LGBM and SVM are the base learners and LR is the meta-classifier.\n",
    "# use_probas=True asks for the base learners' class probabilities as meta-features; cv=5 sets\n",
    "# the number of cross-validation folds used by StackingCVClassifier.\n",
    "base_learners = [XGB, LGBM, SVM]\n",
    "StackingModel = StackingCVClassifier(\n",
    "    classifiers=base_learners,\n",
    "    meta_classifier=LR,\n",
    "    use_probas=True,\n",
    "    cv=5,\n",
    ")\n",
    "StackingModel.fit(x_train, y_train)\n",
    "sm_y_pred = StackingModel.predict(x_test)\n",
    "# Per-class precision / recall / F1 on the held-out split, printed with 5 decimals.\n",
    "ans = classification_report(y_true=y_test, y_pred=sm_y_pred, digits=5)\n",
    "print(ans)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
